import pandas as pd
import numpy as np
import datetime
import plotly.express as px
import copy
import sys
print("Python version")
print (sys.version)
Python version 3.7.9 (default, Aug 31 2020, 17:10:11) [MSC v.1916 64 bit (AMD64)]
attendance_dataset=pd.read_csv('required dataset/attendance.csv')
matches=pd.read_csv('required dataset/matches.csv')
stadiums=pd.read_csv('required dataset/stadiums.csv')
players=pd.read_csv('required dataset/players.csv')
squads=pd.read_csv('required dataset/squads.csv')
tournaments=pd.read_csv("required dataset/tournaments.csv")
goals=pd.read_csv("required dataset/goals.csv")
matches.rename(columns={'home_team_name':'home_team','away_team_name':'away_team','match_date':'Date'},inplace=True)
home_teams_matches=matches.home_team.unique().tolist()
away_teams_matches=matches.away_team.unique().tolist()
all_teams_matches=copy.deepcopy(home_teams_matches)
all_teams_matches.extend(away_teams_matches)
all_teams_matches_set=set(all_teams_matches)
home_teams_attendance=attendance_dataset.home_team.unique().tolist()
away_teams_attendance=attendance_dataset.away_team.unique().tolist()
all_teams_attendance=copy.deepcopy(home_teams_attendance)
all_teams_attendance.extend(away_teams_attendance)
all_teams_attendance_set=set(all_teams_attendance)
different_teams_attendance=all_teams_attendance_set-all_teams_matches_set
different_teams_attendance
{'China PR',
"Côte d'Ivoire",
'FR Yugoslavia',
'Germany DR',
'IR Iran',
'Korea DPR',
'Korea Republic',
'Türkiye'}
different_teams_matches=all_teams_matches_set-all_teams_attendance_set
list(different_teams_matches)
['Ivory Coast', 'North Korea', 'Iran', 'East Germany', 'South Korea', 'Turkey', 'China']
correct_name_dict={
"China PR":"China",
"Côte d'Ivoire":"Ivory Coast",
"FR Yugoslavia":"Yugoslavia",
"Germany DR":"East Germany",
"IR Iran":"Iran",
"Korea DPR":"North Korea",
"Korea Republic":"South Korea",
"Türkiye":"Turkey",
}
correct_attendance_home_teams=[]
correct_attendance_away_teams=[]
correct_matches_home_teams=[]
correct_matches_away_teams=[]
seperaion_germany_date= datetime.datetime.strptime("1950-01-01", '%Y-%m-%d').date()
union_germany_date = datetime.datetime.strptime("1994-01-01", '%Y-%m-%d').date()
def correct_attendance_name(lables):
home_team=lables[0]
away_team=lables[1]
date=lables[2]
d = datetime.datetime.strptime(date, '%Y-%m-%d').date()
if home_team in correct_name_dict:
home_team=correct_name_dict[home_team]
elif home_team =="Germany" and d < union_germany_date and d > seperaion_germany_date:
home_team="West Germany"
if away_team in correct_name_dict:
away_team=correct_name_dict[away_team]
elif away_team =="Germany" and d < union_germany_date and d > seperaion_germany_date:
away_team="West Germany"
correct_attendance_home_teams.append(home_team)
correct_attendance_away_teams.append(away_team)
attendance_dataset[['home_team','away_team','Date']].apply(correct_attendance_name,axis=1)
0 None
1 None
2 None
3 None
4 None
...
959 None
960 None
961 None
962 None
963 None
Length: 964, dtype: object
attendance_dataset['home_team']=correct_attendance_home_teams
attendance_dataset['away_team']=correct_attendance_away_teams
data=pd.merge(matches,stadiums[['stadium_capacity','stadium_id']],on='stadium_id')
data.insert(15, 'stadium_capacity', data.pop('stadium_capacity'))
def fill_attendence(match):
attendance=attendance_dataset.loc[((attendance_dataset.home_team==match[0]) | (attendance_dataset.away_team==match[0])) & (attendance_dataset.Date==match[2])]["Attendance"]
return int(attendance)
attendance_values=data[['home_team','away_team','Date']].apply(fill_attendence,axis=1)
data['Attendance']=attendance_values
attendance_dataset.iloc[::-1].iloc[:15]
| home_team | away_team | Attendance | Date | |
|---|---|---|---|---|
| 963 | France | Mexico | 4444 | 1930-07-13 |
| 962 | United States | Belgium | 18346 | 1930-07-13 |
| 961 | Romania | Peru | 2549 | 1930-07-14 |
| 960 | Yugoslavia | Brazil | 24059 | 1930-07-14 |
| 959 | Argentina | France | 23409 | 1930-07-15 |
| 958 | Chile | Mexico | 9249 | 1930-07-16 |
| 957 | United States | Paraguay | 18306 | 1930-07-17 |
| 956 | Yugoslavia | Bolivia | 18306 | 1930-07-17 |
| 955 | Uruguay | Peru | 57735 | 1930-07-18 |
| 954 | Argentina | Mexico | 42100 | 1930-07-19 |
| 953 | Chile | France | 2000 | 1930-07-19 |
| 952 | Paraguay | Belgium | 12000 | 1930-07-20 |
| 951 | Brazil | Bolivia | 25466 | 1930-07-20 |
| 950 | Uruguay | Romania | 70022 | 1930-07-21 |
| 949 | Argentina | Chile | 41459 | 1930-07-22 |
stadiums.loc[(stadiums.stadium_id=="S-193")|(stadiums.stadium_id=="S-192")|(stadiums.stadium_id=="S-191")][['stadium_capacity','stadium_id']]
| stadium_capacity | stadium_id | |
|---|---|---|
| 190 | 90000 | S-191 |
| 191 | 20000 | S-192 |
| 192 | 10000 | S-193 |
data[['home_team','away_team','Date','stadium_capacity','stadium_id','Attendance']].iloc[:15]
| home_team | away_team | Date | stadium_capacity | stadium_id | Attendance | |
|---|---|---|---|---|---|---|
| 0 | France | Mexico | 1930-07-13 | 10000 | S-193 | 4444 |
| 1 | Romania | Peru | 1930-07-14 | 10000 | S-193 | 2549 |
| 2 | United States | Belgium | 1930-07-13 | 20000 | S-192 | 18346 |
| 3 | Yugoslavia | Brazil | 1930-07-14 | 20000 | S-192 | 24059 |
| 4 | Argentina | France | 1930-07-15 | 20000 | S-192 | 23409 |
| 5 | Chile | Mexico | 1930-07-16 | 20000 | S-192 | 9249 |
| 6 | Yugoslavia | Bolivia | 1930-07-17 | 20000 | S-192 | 18306 |
| 7 | United States | Paraguay | 1930-07-17 | 20000 | S-192 | 18306 |
| 8 | Uruguay | Peru | 1930-07-18 | 90000 | S-191 | 57735 |
| 9 | Chile | France | 1930-07-19 | 90000 | S-191 | 2000 |
| 10 | Argentina | Mexico | 1930-07-19 | 90000 | S-191 | 42100 |
| 11 | Brazil | Bolivia | 1930-07-20 | 90000 | S-191 | 25466 |
| 12 | Paraguay | Belgium | 1930-07-20 | 90000 | S-191 | 12000 |
| 13 | Uruguay | Romania | 1930-07-21 | 90000 | S-191 | 70022 |
| 14 | Argentina | Chile | 1930-07-22 | 90000 | S-191 | 41459 |
players.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 8485 entries, 0 to 8484 Data columns (total 12 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 key_id 8485 non-null int64 1 player_id 8485 non-null object 2 family_name 8485 non-null object 3 given_name 8485 non-null object 4 birth_date 8484 non-null object 5 goal_keeper 8485 non-null int64 6 defender 8485 non-null int64 7 midfielder 8485 non-null int64 8 forward 8485 non-null int64 9 count_tournaments 8485 non-null int64 10 list_tournaments 8485 non-null object 11 player_wikipedia_link 8485 non-null object dtypes: int64(6), object(6) memory usage: 795.6+ KB
players_teams=players[['player_id','given_name','family_name','count_tournaments','list_tournaments']]
teams_name=[]
teams_code=[]
teams_nums=[]
def get_players_teams(labels):
player_id=labels[0]
player_appereance=squads.loc[squads.player_id==player_id]
player_teams_name=player_appereance.team_name.unique().tolist()
player_teams_code=player_appereance.team_code.unique().tolist()
teams_name.append(player_teams_name)
teams_code.append(player_teams_code)
teams_nums.append(len(player_teams_name))
players_teams[['player_id']].apply(get_players_teams,axis=1)
players_teams['teams_name']=teams_name
players_teams['teams_code']=teams_code
players_teams['teams_nums']=teams_nums
D:\Programs\miniconda3\envs\BestEnv\lib\site-packages\ipykernel_launcher.py:2: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy D:\Programs\miniconda3\envs\BestEnv\lib\site-packages\ipykernel_launcher.py:3: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy This is separate from the ipykernel package so we can avoid doing imports until D:\Programs\miniconda3\envs\BestEnv\lib\site-packages\ipykernel_launcher.py:4: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy after removing the cwd from sys.path.
players_teams[players_teams.teams_nums >1].iloc[:10]
| player_id | given_name | family_name | count_tournaments | list_tournaments | teams_name | teams_code | teams_nums | |
|---|---|---|---|---|---|---|---|---|
| 331 | P-01512 | José | Altafini | 2 | 1958, 1962 | [Brazil, Italy] | [BRA, ITA] | 2 |
| 937 | P-01675 | Thomas | Berthold | 3 | 1986, 1990, 1994 | [West Germany, Germany] | [DEU] | 2 |
| 1039 | P-03207 | Alen | Bokšić | 2 | 1990, 2002 | [Yugoslavia, Croatia] | [YUG, HRV] | 2 |
| 1087 | P-05308 | Aleksandr | Borodyuk | 2 | 1990, 1994 | [Soviet Union, Russia] | [SUN, RUS] | 2 |
| 1166 | P-00830 | Andreas | Brehme | 3 | 1986, 1990, 1994 | [West Germany, Germany] | [DEU] | 2 |
| 1227 | P-05379 | Guido | Buchwald | 2 | 1990, 1994 | [West Germany, Germany] | [DEU] | 2 |
| 2037 | P-02845 | Attilio | Demaría | 2 | 1930, 1934 | [Argentina, Italy] | [ARG, ITA] | 2 |
| 3036 | P-06666 | Sergei | Gorlukovich | 2 | 1990, 1994 | [Soviet Union, Russia] | [SUN, RUS] | 2 |
| 3294 | P-02040 | Thomas | Häßler | 3 | 1990, 1994, 1998 | [West Germany, Germany] | [DEU] | 2 |
| 3572 | P-04641 | Bodo | Illgner | 2 | 1990, 1994 | [West Germany, Germany] | [DEU] | 2 |
def total_goals_in_match(labels):
total=labels[0]+labels[1]
return total;
data['total_goals_in_match']=data[['home_team_score','away_team_score']].apply(total_goals_in_match,axis=1)
def match_for_host(labels):
home_team=labels[0]
away_team=labels[1]
host=labels[2]
if home_team ==host:
return True
elif away_team==host:
return True
return False
data['match_for_host']=data[['home_team','away_team','country_name']].apply(match_for_host,axis=1)
def used_capacity_ratio(labels):
capacity=labels[0]
attendance=labels[1]
return capacity/attendance
data['used_capacity_ratio']=data[['stadium_capacity','Attendance']].apply(used_capacity_ratio,axis=1)
data['attendance_category']=pd.qcut(data['Attendance'],q=4,labels=['Low',"Medium","High","Very High"])
data['relative_attendance_category']=pd.qcut(data['used_capacity_ratio'],q=4,labels=['Very High',"High","Medium","Low"])
data[['Attendance','stadium_capacity','attendance_category','relative_attendance_category']]
| Attendance | stadium_capacity | attendance_category | relative_attendance_category | |
|---|---|---|---|---|
| 0 | 4444 | 10000 | Low | Low |
| 1 | 2549 | 10000 | Low | Low |
| 2 | 18346 | 20000 | Low | Medium |
| 3 | 24059 | 20000 | Low | Very High |
| 4 | 23409 | 20000 | Low | Very High |
| ... | ... | ... | ... | ... |
| 959 | 41823 | 44000 | Medium | High |
| 960 | 39789 | 44000 | Medium | Medium |
| 961 | 41232 | 44000 | Medium | High |
| 962 | 43443 | 44000 | High | High |
| 963 | 42523 | 44000 | Medium | High |
964 rows × 4 columns
name_code_dict={}
for name,code in zip(data['away_team'],data['away_team_code']):
name_code_dict[name]=code
def host_country_code(labels):
country_name=labels[0]
return name_code_dict[country_name]
data['host_country_code']=data[['country_name']].apply(host_country_code,axis=1)
def tournament_year(labels):
date=labels[0]
d=datetime.datetime.strptime(date,'%Y-%m-%d')
return d.year
data['tournament_year']=data[['Date']].apply(tournament_year,axis=1)
def full_name(labels):
first_name=labels[0]
last_name=labels[1]
return " ".join([first_name,last_name])
players["full_name"]=players[['given_name','family_name']].apply(full_name,axis=1)
def short_stage_name(labels):
g=labels[0]
k=labels[1]
if g==1 :
return "Group Match"
elif k==1:
return "Knockout Match"
data['short_stage_name']=data[['group_stage','knockout_stage']].apply(short_stage_name,axis=1)
def winner_code(labels):
winner_name=labels[0]
return name_code_dict[winner_name]
tournaments['winner_code']=tournaments[['winner']].apply(winner_code,axis=1)
last_goals=[]
def last_goal(labels):
match_id=labels[0]
last_goal=goals[goals.match_id==match_id]['minute_label'].tolist()
if len(last_goal) ==0:
last_goals.append(0)
return 0
return last_goal.pop()
data['last_goal']=data[['match_id']].apply(last_goal,axis=1)
def late_goal(labels):
last_goal=labels[0]
extra_time=labels[1]
penalty_shootout=labels[2]
if last_goal==0:
return "No Late Goal"
#no extra time like just 86'
if len(last_goal)<4:
if int(last_goal.split('\'')[0]) >85 and extra_time ==0:
return "Late Goal"
else:
return "No Late Goal"
# extra time like 90'+4',120'+2' etc..
elif len(last_goal)>=4:
if (int(last_goal.split('\'')[0])>=90 and extra_time==0) or (int(last_goal.split('\'')[0])>115 and penalty_shootout==0):
return "Late Goal"
else:
return "No Late Goal"
return "No Late Goal"
data['late_goal']=data[['last_goal','extra_time','penalty_shootout']].apply(late_goal,axis=1)
data.iloc[0:,-15:]
| result | home_team_win | away_team_win | draw | Attendance | total_goals_in_match | match_for_host | used_capacity_ratio | attendance_category | relative_attendance_category | host_country_code | tournament_year | short_stage_name | last_goal | late_goal | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | home team win | 1 | 0 | 0 | 4444 | 5 | False | 2.250225 | Low | Low | URY | 1930 | Group Match | 87' | Late Goal |
| 1 | home team win | 1 | 0 | 0 | 2549 | 4 | False | 3.923107 | Low | Low | URY | 1930 | Group Match | 89' | Late Goal |
| 2 | home team win | 1 | 0 | 0 | 18346 | 3 | False | 1.090156 | Low | Medium | URY | 1930 | Group Match | 69' | No Late Goal |
| 3 | home team win | 1 | 0 | 0 | 24059 | 3 | False | 0.831290 | Low | Very High | URY | 1930 | Group Match | 62' | No Late Goal |
| 4 | home team win | 1 | 0 | 0 | 23409 | 1 | False | 0.854372 | Low | Very High | URY | 1930 | Group Match | 81' | No Late Goal |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 959 | away team win | 0 | 1 | 0 | 41823 | 1 | False | 1.052053 | Medium | High | QAT | 2022 | Group Match | 23' | No Late Goal |
| 960 | draw | 0 | 0 | 1 | 39789 | 6 | False | 1.105833 | Medium | Medium | QAT | 2022 | Group Match | 66' | No Late Goal |
| 961 | home team win | 1 | 0 | 0 | 41232 | 1 | False | 1.067132 | Medium | High | QAT | 2022 | Group Match | 60' | No Late Goal |
| 962 | away team win | 0 | 1 | 0 | 43443 | 2 | False | 1.012821 | High | High | QAT | 2022 | Group Match | 32' | No Late Goal |
| 963 | away team win | 0 | 1 | 0 | 42523 | 2 | False | 1.034734 | Medium | High | QAT | 2022 | Knockout Match | 55' | No Late Goal |
964 rows × 15 columns
my_data = data[['Attendance','tournament_year']].groupby(['tournament_year']).mean().reset_index()
my_data = my_data.rename(columns = {"Attendance": 'mean'})
my_data['median'] = data[['Attendance','tournament_year']].groupby(['tournament_year']).median().reset_index().Attendance
fig = px.line(my_data,x='tournament_year',y=['mean','median'],title='Attendance Over The Years')
fig.show()
fig = px.histogram(data,x='Attendance',nbins=50)
fig.show()
fig = px.box(data,x='tournament_year',y='Attendance')
fig.show()
def convert_minute_label_to_int(labels):
minute_label=labels[0]
if len(minute_label)<=4:
return int(minute_label.split('\'')[0])
else:
return int(minute_label.split('\'')[0])+ int(minute_label.split('\'')[1][1])
goals['minute_label_int']=goals[['minute_label']].apply(convert_minute_label_to_int,axis=1)
my_data = goals[['minute_label_int','tournament_id']].groupby(['tournament_id']).mean().reset_index()
my_data = my_data.rename(columns = {'minute_label_int':'mean'})
fig = px.bar(my_data,x='tournament_id',y='mean')
fig.show()
my_data = goals[['goal_id','match_id']].groupby(['match_id']).size().reset_index(name='count')
fig = px.histogram(my_data, y='count',x='match_id')
fig.show()
def get_most_repeated_goals(minute_labels):
return minute_labels['minute_label_int'].mode()
my_data=goals[['tournament_id','minute_label','minute_label_int']].groupby(['tournament_id']).apply(get_most_repeated_goals).reset_index()
my_data.rename(columns={0:'most_repeated_minute'},inplace=True)
fig = px.scatter(my_data, y="most_repeated_minute",x='tournament_id')
fig.show()
my_data=data[['late_goal','tournament_id']].groupby(['tournament_id']).size().reset_index(name='count')
fig = px.histogram(my_data, y='count',x='tournament_id')
fig.show()
my_data=goals[['given_name','family_name','goal_id']].groupby(['given_name','family_name']).size().reset_index(name='count').sort_values('count').reset_index()
my_data = my_data.iloc[-12:]
fig = px.bar(my_data, y='count',x='family_name')
fig.show()
sepeated_data=goals.groupby(['tournament_id','given_name','family_name']).size().reset_index(name='count')
final_data=sepeated_data.groupby('tournament_id')
top_scorer={}
for name,group in final_data:
group.reset_index(inplace=True)
index=group['count'].idxmax()
player_name=group['family_name'].iloc[index]
top_scorer[name + ' / '+ player_name]=group['count'].max()
fig = px.bar(y=top_scorer.values(), x=top_scorer.keys())
fig.show()
my_data = goals.groupby(['tournament_id']).size().reset_index(name='count')
fig = px.bar(my_data, y='count',x='tournament_id')
fig.show()
#East Germany,West Germany , Germany , Brazil , Italy:
specific_teams=goals.loc[(goals.team_name=='East Germany') |(goals.team_name=='West Germany') | (goals.team_name=="Germany") | (goals.team_name == "Italy") | (goals.team_name=="Brazil")]
fig = px.strip(specific_teams, y='minute_label_int',x='stage_name')
fig.show()
all_matches={}
def get_most_repeated_matches(labels):
home_team=labels[0]
away_team=labels[1]
full_team= home_team + ' vs ' + away_team
if full_team not in all_matches:
all_matches[full_team]=1
else:
all_matches[full_team]=all_matches[full_team]+1
matches[['home_team','away_team']].apply(get_most_repeated_matches,axis=1)
0 None
1 None
2 None
3 None
4 None
...
959 None
960 None
961 None
962 None
963 None
Length: 964, dtype: object
most_repeated_matchessorted=sorted(all_matches.items(),key=lambda x:x[1])[-10:]
repeated_matches=[x[1] for x in most_repeated_matchessorted[-12:]]
repeated_num=[x[0] for x in most_repeated_matchessorted[-12:]]
fig = px.bar(y=repeated_matches, x=repeated_num)
fig.show()
players_teams.loc[players_teams['teams_nums']>=2][players_teams.columns]
| player_id | given_name | family_name | count_tournaments | list_tournaments | teams_name | teams_code | teams_nums | |
|---|---|---|---|---|---|---|---|---|
| 331 | P-01512 | José | Altafini | 2 | 1958, 1962 | [Brazil, Italy] | [BRA, ITA] | 2 |
| 937 | P-01675 | Thomas | Berthold | 3 | 1986, 1990, 1994 | [West Germany, Germany] | [DEU] | 2 |
| 1039 | P-03207 | Alen | Bokšić | 2 | 1990, 2002 | [Yugoslavia, Croatia] | [YUG, HRV] | 2 |
| 1087 | P-05308 | Aleksandr | Borodyuk | 2 | 1990, 1994 | [Soviet Union, Russia] | [SUN, RUS] | 2 |
| 1166 | P-00830 | Andreas | Brehme | 3 | 1986, 1990, 1994 | [West Germany, Germany] | [DEU] | 2 |
| 1227 | P-05379 | Guido | Buchwald | 2 | 1990, 1994 | [West Germany, Germany] | [DEU] | 2 |
| 2037 | P-02845 | Attilio | Demaría | 2 | 1930, 1934 | [Argentina, Italy] | [ARG, ITA] | 2 |
| 3036 | P-06666 | Sergei | Gorlukovich | 2 | 1990, 1994 | [Soviet Union, Russia] | [SUN, RUS] | 2 |
| 3294 | P-02040 | Thomas | Häßler | 3 | 1990, 1994, 1998 | [West Germany, Germany] | [DEU] | 2 |
| 3572 | P-04641 | Bodo | Illgner | 2 | 1990, 1994 | [West Germany, Germany] | [DEU] | 2 |
| 3719 | P-04605 | Robert | Jarni | 3 | 1990, 1998, 2002 | [Yugoslavia, Croatia] | [YUG, HRV] | 2 |
| 4131 | P-06604 | Jürgen | Klinsmann | 3 | 1990, 1994, 1998 | [West Germany, Germany] | [DEU] | 2 |
| 4155 | P-08277 | Jürgen | Kohler | 3 | 1990, 1994, 1998 | [West Germany, Germany] | [DEU] | 2 |
| 4200 | P-03556 | Andreas | Köpke | 3 | 1990, 1994, 1998 | [West Germany, Germany] | [DEU] | 2 |
| 4968 | P-09502 | Lothar | Matthäus | 5 | 1982, 1986, 1990, 1994, 1998 | [West Germany, Germany] | [DEU] | 2 |
| 5200 | P-09293 | Savo | Milošević | 2 | 1998, 2006 | [Yugoslavia, Serbia and Montenegro] | [YUG, SCG] | 2 |
| 5280 | P-01103 | Andreas | Möller | 3 | 1990, 1994, 1998 | [West Germany, Germany] | [DEU] | 2 |
| 5304 | P-02543 | Luis | Monti | 2 | 1930, 1934 | [Argentina, Italy] | [ARG, ITA] | 2 |
| 6319 | P-02576 | Robert | Prosinečki | 3 | 1990, 1998, 2002 | [Yugoslavia, Croatia] | [YUG, HRV] | 2 |
| 6340 | P-01259 | Ferenc | Puskás | 2 | 1954, 1962 | [Hungary, Spain] | [HUN, ESP] | 2 |
| 6396 | P-01757 | Rudolf | Raftl | 2 | 1934, 1938 | [Austria, Germany] | [AUT, DEU] | 2 |
| 6506 | P-06121 | Stefan | Reuter | 2 | 1990, 1998 | [West Germany, Germany] | [DEU] | 2 |
| 6545 | P-07779 | Karl-Heinz | Riedle | 2 | 1990, 1994 | [West Germany, Germany] | [DEU] | 2 |
| 6903 | P-05394 | José | Santamaría | 2 | 1954, 1962 | [Uruguay, Spain] | [URY, ESP] | 2 |
| 6974 | P-09633 | Willibald | Schmaus | 2 | 1934, 1938 | [Austria, Germany] | [AUT, DEU] | 2 |
| 7353 | P-02369 | Dejan | Stanković | 3 | 1998, 2006, 2010 | [Yugoslavia, Serbia and Montenegro, Serbia] | [YUG, SCG, SRB] | 3 |
| 7403 | P-08793 | Vladimir | Stojković | 3 | 2006, 2010, 2018 | [Serbia and Montenegro, Serbia] | [SCG, SRB] | 2 |
| 7424 | P-08975 | Josef | Stroh | 2 | 1934, 1938 | [Austria, Germany] | [AUT, DEU] | 2 |
| 7451 | P-01739 | Davor | Šuker | 3 | 1990, 1998, 2002 | [Yugoslavia, Croatia] | [YUG, HRV] | 2 |
| 7620 | P-07346 | Olaf | Thon | 3 | 1986, 1990, 1998 | [West Germany, Germany] | [DEU] | 2 |
| 8030 | P-06299 | Nemanja | Vidić | 2 | 2006, 2010 | [Serbia and Montenegro, Serbia] | [SCG, SRB] | 2 |
| 8093 | P-02452 | Rudi | Völler | 3 | 1986, 1990, 1994 | [West Germany, Germany] | [DEU] | 2 |
| 8131 | P-00537 | Franz | Wagner | 2 | 1934, 1938 | [Austria, Germany] | [AUT, DEU] | 2 |
| 8443 | P-02502 | Nikola | Žigić | 2 | 2006, 2010 | [Serbia and Montenegro, Serbia] | [SCG, SRB] | 2 |
def host_winner(labels):
host=labels[0]
winner=labels[1]
if host == winner:
return 1
else:
return 0
tournaments['host_winner']=tournaments[['host_country','winner']].apply(host_winner,axis=1)
grouped_data = tournaments.groupby(['host_winner']).size().reset_index(name='count')
px.bar(x=['host_didnt_win','host_win'],y=[grouped_data['count'].tolist()])
grouped_home_data = data[['home_team','Attendance']].groupby(['home_team']).sum().reset_index()
grouped_home_data.rename(columns={'Attendance':'home_attendance'},inplace=True)
grouped_away_data = data[['away_team','Attendance']].groupby(['away_team']).sum().reset_index()
grouped_away_data.rename(columns={'Attendance':'away_attendance'},inplace=True)
grouped_team_data=pd.concat([grouped_home_data,grouped_away_data],axis=1)
grouped_team_data.dropna(inplace=True)
fig = px.line(grouped_team_data,x='home_team',y=['home_attendance','away_attendance'],title='home attendance vs away attendance')
fig.show()
def get_match_for_host(labels):
home_team=labels[0]
away_team=labels[1]
country=labels[2]
if home_team==country or away_team == country:
return True
else:
return False
host_is_playing=data[['home_team','away_team','country_name']].apply(get_match_for_host,axis=1)
host_data=data.loc[host_is_playing][['home_team','away_team','country_name','Attendance']]
host_attend=host_data.groupby('country_name').sum().reset_index()
host_attend['number_of_matches']=host_data.groupby('country_name').size().reset_index(name='number_of_mathces')["number_of_mathces"]
host_attend['mean_values']=host_attend['Attendance']//host_attend['number_of_matches']
def get_match_for_not_host(labels):
home_team=labels[0]
away_team=labels[1]
country=labels[2]
if home_team!=country and away_team != country and (home_team in data.country_name.unique().tolist() or away_team in data.country_name.unique().tolist()):
return True
else:
return False
host_is_not_playing=data[['home_team','away_team','country_name']].apply(get_match_for_not_host,axis=1)
not_host_data=data.loc[host_is_not_playing][['home_team','away_team','country_name','Attendance']]
not_host_attend=not_host_data.groupby('country_name').sum().reset_index()
not_host_attend['number_of_matches']=not_host_data.groupby('country_name').size().reset_index(name='number_of_mathces')["number_of_mathces"]
not_host_attend['mean_values']=not_host_attend['Attendance']//not_host_attend['number_of_matches']
final_data=pd.DataFrame({'country':host_attend.country_name,'host_mean_values':host_attend.mean_values,'not_host_mean_values':not_host_attend.mean_values})
fig = px.line(final_data,x='country',y=['host_mean_values','not_host_mean_values'],title='host attendance vs not host| attendance')
fig.show()
teams_players=pd.DataFrame(columns={'team_name','players_id','tournament_id'})
ids=[]
names=[]
tournament_ids=[]
for name,group in squads.groupby(['tournament_id',"team_name"]):
group.reset_index()
ids.append(group.player_id.tolist())
names.append(name[1])
tournament_ids.append(name[0])
teams_players['team_name']=names
teams_players['players_id']=ids
teams_players['tournament_id']=tournament_ids
teams_players.insert(2,'players_id',teams_players.pop('players_id'))
winner_team_players_ids=pd.DataFrame(columns=['tournament_id','winner_team','players_ids'])
teams=[]
ids=[]
tournament_ids=[]
for index,row in tournaments[['tournament_id','winner']].iterrows():
tournament_id=row[0]
team_name=row[1]
id_players=teams_players.loc[(teams_players['tournament_id']==tournament_id) & (teams_players['team_name']==team_name)]['players_id'].tolist()
teams.append(team_name)
tournament_ids.append(tournament_id)
ids.append(id_players[0])
winner_team_players_ids['tournament_id']=tournament_ids
winner_team_players_ids['winner_team']=teams
winner_team_players_ids['players_ids']=ids
def get_mean_ages(labels):
ages=[]
for player_id in labels[0]:
birth=players.loc[players['player_id']==player_id]['birth_date'].tolist()[0]
birth_year=int(birth.split('-')[0])
tour_year=int(labels[1].split('-')[1])
age=tour_year-birth_year
ages.append(age)
return np.array(ages).mean()
winner_team_players_ids['mean_ages']=winner_team_players_ids[['players_ids','tournament_id']].apply(get_mean_ages,axis=1)
winner_team_players_ids
| tournament_id | winner_team | players_ids | mean_ages | |
|---|---|---|---|---|
| 0 | WC-1930 | Uruguay | [P-04671, P-09960, P-00948, P-09427, P-01245, ... | 26.681818 |
| 1 | WC-1934 | Italy | [P-06155, P-08634, P-09236, P-04825, P-07663, ... | 28.409091 |
| 2 | WC-1938 | Italy | [P-05226, P-02232, P-08903, P-02773, P-07300, ... | 26.500000 |
| 3 | WC-1950 | Uruguay | [P-00352, P-01717, P-03413, P-09441, P-09730, ... | 25.681818 |
| 4 | WC-1954 | West Germany | [P-05844, P-08245, P-08247, P-03533, P-02287, ... | 28.045455 |
| 5 | WC-1958 | Brazil | [P-02577, P-08512, P-02385, P-02499, P-07958, ... | 25.954545 |
| 6 | WC-1962 | Brazil | [P-02385, P-02499, P-08400, P-09928, P-08419, ... | 28.136364 |
| 7 | WC-1966 | England | [P-00753, P-07792, P-00049, P-08377, P-06862, ... | 27.227273 |
| 8 | WC-1970 | Brazil | [P-08384, P-07366, P-08598, P-04742, P-06480, ... | 25.045455 |
| 9 | WC-1974 | West Germany | [P-07375, P-00117, P-02579, P-07938, P-08834, ... | 27.318182 |
| 10 | WC-1978 | Argentina | [P-07746, P-02797, P-09250, P-01875, P-01242, ... | 26.272727 |
| 11 | WC-1982 | Italy | [P-03649, P-02667, P-02782, P-06485, P-07960, ... | 27.318182 |
| 12 | WC-1986 | Argentina | [P-08578, P-09253, P-05640, P-04444, P-00625, ... | 27.045455 |
| 13 | WC-1990 | West Germany | [P-04641, P-06121, P-00830, P-08277, P-08719, ... | 27.772727 |
| 14 | WC-1994 | Brazil | [P-07055, P-07794, P-06428, P-00131, P-00431, ... | 27.772727 |
| 15 | WC-1998 | France | [P-08621, P-02758, P-08661, P-01388, P-09168, ... | 27.454545 |
| 16 | WC-2002 | Brazil | [P-00168, P-04884, P-05789, P-07081, P-04820, ... | 26.739130 |
| 17 | WC-2006 | Italy | [P-00677, P-09031, P-02751, P-00192, P-08335, ... | 28.782609 |
| 18 | WC-2010 | Spain | [P-08599, P-02614, P-08653, P-00806, P-09494, ... | 26.304348 |
| 19 | WC-2014 | Germany | [P-09934, P-05176, P-02378, P-08606, P-03098, ... | 26.217391 |
| 20 | WC-2018 | France | [P-09916, P-09949, P-07157, P-00846, P-02672, ... | 26.086957 |
| 21 | WC-2022 | Argentina | [P-00773, P-09642, P-02432, P-03476, P-03419, ... | 27.807692 |
fig = px.bar(winner_team_players_ids,x='tournament_id',y='mean_ages')
fig.show()